Objective

The following code generates the unsupervised hierarchical clustering heatmap for the KSPZV1 dataset using the ComplexHeatmap package. Clustering analysis is performed to test for statistical differences between Sample Clusters (SC) for outcomes as well as relevant variables (specifically SC2 and SC1,3,4). In addition, row clustering generates Gene Clusters (GC), and GSEA using blood transcription modules is then applied to genes within GC4 ranked by variance.

Final options for delta: allgroups filter pre-seqmonk < 7.5M MADS = 25% clustering: ward.D2 and euclidean, 4 clusters for columns, 5 clusters for genes

library(ComplexHeatmap)
library(miscTools)
library(edgeR)
library(Biobase)
library(tidyverse)
library(googledrive)
library(data.table)
library(ggpubr)
library(fgsea)
library(devtools)
library(knitr)
myBatch <- "both" 
myGroup <- "allGroups" 
myTimepoint <- "delta" 
ClusterCols <- TRUE
col.hclust.method <- "ward.D2" 
col.dist.method <- "euclidean" 
row.hclust.method <- "ward.D2" 
row.dist.method <- "euclidean" 
PCT <- 25 
filter.type <- paste("mads",PCT,"pct",sep="")
noColClust <- 4
noRowClust <- 5

Load ExpressionSet

temp <- tempfile(fileext = ".rds")
dl <- drive_download(
  as_id("17xx0YaggLiyWdJc9rqA1nkPSE7dj05p0"), path = temp, overwrite = TRUE)
x <- readRDS(file = dl$local_path)
x$treat <- factor(x$treat, levels = c("Placebo", "4.5 x 10^5 PfSPZ", "9.0 x 10^5 PfSPZ", "1.8 x 10^6 PfSPZ"))

Add additional baseline data

#CBC data
temp <- tempfile(fileext = ".xlsx")
dl <- drive_download(
  as_id("1df0pHJAlFteRIcQuJql3U2IZFe5SpZdB"), path = temp, overwrite = TRUE)
CBCdat <- readxl::read_excel(dl$local_path) %>%
  mutate(DaysPriorVax1 = as.integer(gsub(" \\(Dose 1\\)", "", .$`Days Post Vaccination`))*-1,
         Hemoglobin = as.numeric(Hemoglobin)) %>%
  dplyr::select(PATID, DaysPriorVax1, Hemoglobin, Platelets, WBC, Neutrophils, Percent.Neutrophils)
pData(x) <- pData(x) %>%
  left_join(., CBCdat, by = "PATID")

## weight data
temp <- tempfile(fileext = ".csv")
dl <- drive_download(
  as_id("1hvjTuBKWIYKmDrOdiogGqcHPJUUPZM4P"), path = temp, overwrite = TRUE)
WeightDat <- read_csv(dl$local_path) %>%
  dplyr::select(PATID, zwei, zwfl, zbmi) #for definitions: https://cran.r-project.org/web/packages/anthro/anthro.pdf
pData(x) <- pData(x) %>%
  left_join(., WeightDat, by = "PATID") 

#add pre-vax cluster information
temp <- tempfile(fileext = ".csv")
dl <- drive_download(
  as_id("1LtD4PeZZ34dCnghGBikY9wTs9CtsebKo"), path = temp, overwrite = TRUE)
pData(x) <- read_csv(dl$local_path) %>%
  dplyr::select(PATID, Cluster) %>%
  dplyr::rename(PrevaxCluster = Cluster) %>%
  left_join(pData(x), ., by = "PATID")
rownames(pData(x)) <- pData(x)$PATID
all.pDat <- pData(x)

##Filtering features

dim(x)
## Features  Samples 
##    23768      230
if(PCT < 100){
  madsno <- as.integer(nrow(x)*(PCT/100))
  mads <- apply(exprs(x), 1, mad)  #mad filtering
  x <- x[mads>sort(mads, decr=TRUE)[madsno],]
}
dim(x)
## Features  Samples 
##     5941      230

Make annotation dataframe and then build heatmap annotation object using HeatMapAnnotation function

For details on how to make annotations, refer to: https://jokergoo.github.io/ComplexHeatmap-reference/book/

#full
annot.df <- as.data.frame(cbind(
  "Gender" = as.character(factor(x$SEX, levels = c("M","F"), labels = c("male","female"))),
  "Age" = as.character(x$age.vax1),
  "Pf at first vax" = as.character(factor(x$mal.vax.1, levels = c(0,1), labels = c("neg","pos"))),
  "# Pf episodes during vax" = as.character(x$mal.dvax.tot),
  "CSP IgG pre-vax" = as.character(log10(x$pfcsp_pre+1)),
  "CSP IgG response (log2 fold change)" = as.character(x$log2FC_CSPAb),
  "Days to first parasitemia post-vax" = as.character(x$tte.mal.atp.6),
  "Outcome" = as.character(factor(x$mal.atp.3, levels = c(0,1), labels = c("protected","unprotected"))),
  "Dose" = as.character(x$treat)
  ))
annot.df <- rev(annot.df)
for(i in c(1:ncol(annot.df))){
  annot.df[,i] <- as.character(annot.df[,i])
  }
annot.df$Age <- as.numeric(annot.df$Age)
annot.df$`Days to first parasitemia post-vax` <- as.numeric(annot.df$`Days to first parasitemia post-vax`)
annot.df$`# Pf episodes during vax` <- as.numeric(annot.df$`# Pf episodes during vax`)
annot.df$`CSP IgG response (log2 fold change)` <- as.numeric(annot.df$`CSP IgG response (log2 fold change)`)
annot.df$`CSP IgG pre-vax` <- as.numeric(annot.df$`CSP IgG pre-vax`)
rownames(annot.df) <- colnames(x)
clab <- list(
  "Gender" = c("male" = "#E5E5E5", "female" = "#191919"),
  "Age" = circlize::colorRamp2(c(min(all.pDat$age.vax1, na.rm=T), max(all.pDat$age.vax1, na.rm=T)), c("#E5E5E5", "#333333")),
  "Pf at first vax" = c("neg" = "#D1D3D3", "pos" = "#b2182b"),
  "# Pf episodes during vax" = circlize::colorRamp2(c(min(all.pDat$mal.dvax.tot, na.rm=T), max(all.pDat$mal.dvax.tot, na.rm=T)),
                                                    c("#fee5d9", "#a50f15")),
  "CSP IgG pre-vax" = circlize::colorRamp2(c(min(log10(all.pDat$pfcsp_pre+1), na.rm=T), max(log10(all.pDat$pfcsp_pre+1), na.rm=T)), c("#f2f0f7", "#54278f")),
  "CSP IgG response (log2 fold change)" = circlize::colorRamp2(c(-max(abs(all.pDat$log2FC_CSPAb), na.rm=T), 0, max(abs(all.pDat$log2FC_CSPAb), na.rm=T)), c("#656566", "white", "#54278f")),
  "Days to first parasitemia post-vax" = circlize::colorRamp2(c(min(all.pDat$tte.mal.atp.6, na.rm=T), max(all.pDat$tte.mal.atp.6, na.rm=T)), c("#d4defc", "#112663")),
  "Outcome" = c("protected" = "#1F78B4", "unprotected" = "#A6CEE3"),
  "Dose" = c("Placebo" = "#808080", "4.5 x 10^5 PfSPZ" = "#fdcc8a", "9.0 x 10^5 PfSPZ" = "#fc8d59", "1.8 x 10^6 PfSPZ" = "#d7301f")
  )
clab <- rev(clab)
ha <- HeatmapAnnotation(df = annot.df, col = clab, na_col = "lavender", annotation_name_gp = gpar(fontsize = 9.5), annotation_height = 0.25, annotation_name_side = "left", 
        simple_anno_size = unit(0.333, "cm"))

Scaling

colnames(x) <- colnames(exprs(x))
unscaled_logCPM <- x
ScaleData <- FALSE
if(myTimepoint == "delta"){
  myScaleData <- "logFC"
  myHeatlegend.title <- "log2 fold-change over pre-vax"
}

Clustering and Distance options

This chunk takes a while if you run the full gene set so be patient.

#https://jokergoo.github.io/ComplexHeatmap-reference/book/a-single-heatmap.html
#See options for variables
hclust.col <- cutree(stats::hclust(dist(t(exprs(x)), method = col.dist.method), method = col.hclust.method), k = noColClust)
hclust.row <- cutree(stats::hclust(dist(exprs(x), method = row.dist.method), method = row.hclust.method), k = noRowClust)

Build heatmap object

set.seed(12345)
x$treat <- droplevels(x$treat)
x$dummytreatlevels <- factor(x$treat, levels = c("Placebo",
                                                   "4.5 x 10^5 PfSPZ",
                                                   "9.0 x 10^5 PfSPZ",
                                                   "1.8 x 10^6 PfSPZ"),
                               labels = c(1,2,3,4))
hm <- Heatmap(exprs(x),
              name = "mat",
              column_title = "subjects",
              column_title_side = "bottom",
              show_row_dend = FALSE,
              row_title = paste(filter.type, nrow(x), "genes"),
              row_title_side = "left",
              row_names_side = "left",
              row_names_gp = gpar(fontsize = 8),
              row_names_max_width = unit(9, "cm"),
              cluster_columns = TRUE,
              cluster_rows = TRUE,
              column_split = paste(x$dummytreatlevels, hclust.col, sep = "_"),
              row_split = paste("gene cluster ",hclust.row, sep = "_"),
              show_row_names = FALSE,
              top_annotation = ha, heatmap_legend_param = list(title = myHeatlegend.title, color_bar = "continuous", legend_direction = "horizontal"),
              show_column_names=FALSE,
              cluster_column_slices = FALSE,
              cluster_row_slices = FALSE
              )

Extract clustering information

see https://github.com/jokergoo/ComplexHeatmap/issues/136

Note: may have to run this chunk directly in console if you get the following error: Error: The width or height of the raster image is zero, maybe you forget to turn off the previous graphic device or it was corrupted. Run dev.off() to close it.

hm <- draw(hm)

c.dend <- column_dend(hm)  #Extract col dendrogram
ccl.list <- column_order(hm)  #Extract clusters (output is a list)
#lapply(ccl.list, function(x) length(x))  #check/confirm size clusters

#loop to extract samples for each cluster.
for (i in 1:length(column_order(hm))){
  if (i == 1) {
    clu <- t(t(colnames(exprs(x)[,column_order(hm)[[i]], drop = FALSE]))) #one cluster had a single column, so have to add drop = FALSE argument to prevent the matrix from becoming a vector
    out <- cbind(clu, paste("cluster", i, sep=""))
    colnames(out) <- c("SampleID", "Cluster")
    } else {
      clu <- t(t(colnames(exprs(x)[,column_order(hm)[[i]], drop = FALSE])))
      clu <- cbind(clu, paste("cluster", i, sep=""))
      out <- rbind(out, clu)
    }
}

## Merge with pData(x)
pData.cluster <- pData(x) %>%
  rownames_to_column(var = "SampleID") %>%
  left_join(., as_tibble(out), by = "SampleID") %>%
  column_to_rownames(var = "SampleID") %>%
  mutate(Cluster = gsub("cluster", "SC", .$Cluster)) %>%
  mutate(Cluster.pam4.cl16 = Cluster) %>%
  mutate(Cluster = gsub("6", "2", .$Cluster)) %>%
  mutate(Cluster = gsub("10", "2", .$Cluster)) %>%
  mutate(Cluster = gsub("14", "2", .$Cluster)) %>%
  mutate(Cluster = ifelse(.$Cluster != "SC2", "SC1,3,4", .$Cluster)) %>%
  filter(Cluster == "SC2" | Cluster == "SC1,3,4")

#sanity checks
table(pData.cluster$Cluster,pData.cluster$treat)
##          
##           Placebo 4.5 x 10^5 PfSPZ 9.0 x 10^5 PfSPZ 1.8 x 10^6 PfSPZ
##   SC1,3,4      51               54               46               48
##   SC2          10                1                7               13
table(pData.cluster$Cluster.pam4.cl16,pData.cluster$treat)
##       
##        Placebo 4.5 x 10^5 PfSPZ 9.0 x 10^5 PfSPZ 1.8 x 10^6 PfSPZ
##   SC1       35                0                0                0
##   SC10       0                0                7                0
##   SC11       0                0               17                0
##   SC12       0                0                4                0
##   SC13       0                0                0               28
##   SC14       0                0                0               13
##   SC15       0                0                0               17
##   SC16       0                0                0                3
##   SC2       10                0                0                0
##   SC3       14                0                0                0
##   SC4        2                0                0                0
##   SC5        0               36                0                0
##   SC6        0                1                0                0
##   SC7        0               12                0                0
##   SC8        0                6                0                0
##   SC9        0                0               25                0

Plot stats based on clusters

basefont <- "sans"
basefontsize <- 9
pvalfontsize <- 4
mySignifLabel <- "p.signif"
symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1), symbols = c("****", "***", "**", "*", "trend","ns"))

#outcome
foo1 <- c(1,2,3,4)  
names(foo1) <-  names(summary(pData.cluster$treat))
for(i in names(summary(pData.cluster$treat))){
  pData.cluster.temp <- pData.cluster %>%
    filter(treat == i)
  foo1[i] <- signif(fisher.test(table("Cluster" = pData.cluster.temp$Cluster, "Outcome" = pData.cluster.temp$mal.atp.3))$p.value,2)
  }
dat_text <- data.frame(treat = factor(levels(pData.cluster$treat), levels = levels(pData.cluster$treat)), label = foo1, Cluster = levels(factor(pData.cluster.temp$Cluster)), x = 1.5, y = 25,
                         Outcome = c("not protected (infected)", "protected (uninfected)"))

Outcome.counts <- pData.cluster %>%
  mutate(Cluster = factor(Cluster)) %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)"))) %>%
  ggplot(., aes(x=Cluster, fill = Outcome)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("Outcome (count)") +
  facet_wrap(.~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.background = element_rect(fill="#fef0d9"),
        legend.position="none"
        ) +
  geom_text(
    data = dat_text,
    mapping = aes(x = x, y = y, label = label))

#Days to first parasitemia
TTE.boxplot <- pData.cluster %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)"))) %>%
  ggplot(., aes(Cluster, tte.mal.atp.6, fill = Outcome, color = Outcome)) +
  geom_point(position = position_jitterdodge()) +
  geom_boxplot(alpha = 0.4, color = "grey30", outlier.shape = NA) +
  stat_compare_means(aes(group = Outcome), label = "p.format", method = "wilcox.test", label.x.npc = "center", vjust = 1) +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  scale_color_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("Days to first parasitemia") +
  ylim(c(0,180)) +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.background = element_rect(fill="#d7301f"),
        legend.position="none"
        )

#CSP-specific IgG at pre-vax
CSPbl.boxplot <- pData.cluster %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)"))) %>%
  dplyr::filter(!(is.na(pfcsp_pre) | is.na(Outcome))) %>% 
  ggplot(., aes(Cluster, log10(pfcsp_pre+1), fill = Outcome, color = Outcome)) +
  geom_point(position = position_jitterdodge()) +
  geom_boxplot(alpha = 0.4, color = "grey30", outlier.shape = NA) +
  stat_compare_means(aes(group = Outcome), label = "p.format", method = "wilcox.test", label.x.npc = "center", vjust = 1, symnum.args = symnum.args) +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  scale_color_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("CSP-specific IgG at pre-vax") +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position="none"
        )

#CSP-specific IgG log2FC
CSPlogfc.boxplot <- pData.cluster %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)"))) %>%
  dplyr::filter(!(is.na(log2FC_CSPAb) | is.na(Outcome))) %>% 
  ggplot(., aes(x = Cluster, y = log2FC_CSPAb, fill = Outcome, color = Outcome)) +
  geom_point(position = position_jitterdodge()) +
  geom_boxplot(alpha = 0.4, color = "grey30", outlier.shape = NA) +
  stat_compare_means(aes(group = Outcome), label = "p.format", method = "wilcox.test", label.x.npc = "center", vjust = 1, symnum.args = symnum.args) +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  scale_color_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("CSP-specific IgG (log fold change)") +
  ylim(c(-12.5,25)) +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position="none"
        )

#Pf at first vax
foo1 <- c(1,2,3,4)  
names(foo1) <-  names(summary(pData.cluster$treat))
pData.cluster <- pData.cluster %>%
  mutate(Cluster = factor(Cluster)) %>%
  mutate(Pf = factor(mal.vax.1, levels = c(1,0), labels = c("pos", "neg")))
for(i in names(summary(pData.cluster$treat))){
  pData.cluster.temp <- pData.cluster %>%
    filter(treat == i)
  foo1[i] <- signif(fisher.test(table("Cluster" = pData.cluster.temp$Cluster, "Pf at Vax1" = pData.cluster.temp$Pf))$p.value,2)
  }
dat_text <- data.frame(treat = factor(levels(pData.cluster$treat), levels = levels(pData.cluster$treat)), label = foo1, Cluster = levels(factor(pData.cluster.temp$Cluster)),
                       x = 1.5, y = 45,
                       Pf = c("pos","neg"))            
Pf.counts <-  pData.cluster %>%
  ggplot(., aes(x=Cluster, fill = Pf)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("#b2182b", "#D1D3D3")) +
  ylab("Pf at first vax (counts)") +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position="none"
        ) +
  geom_text(data = dat_text,mapping = aes(x = x, y = y, label = label))

#Pf episodes during vax period
mal.dvax.tot.boxplot <- pData.cluster %>%
  mutate(Outcome = factor(mal.atp.3, levels = c(1,0), labels = c("not protected (infected)", "protected (uninfected)"))) %>%
  ggplot(., aes(Cluster, mal.dvax.tot, fill = Outcome, color = Outcome)) +
  geom_point(position = position_jitterdodge()) +
  geom_boxplot(alpha = 0.4, color = "grey30", outlier.shape = NA) +
  stat_compare_means(aes(group = Outcome), label = "p.signif", method = "wilcox.test", label.x.npc = "center", vjust = 1, symnum.args = symnum.args) +
  scale_fill_manual(values = c("#A6CEE3", "#1F78B4")) +
  scale_color_manual(values = c("#A6CEE3", "#1F78B4")) +
  ylab("# Pf episodes during vax") +
  ylim(c(0,6.5)) +
  facet_wrap(~treat, ncol = 4) +
  theme_classic(base_family = basefont, base_size = basefontsize) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.x=element_blank(),
        strip.text.x = element_blank(),
        strip.background = element_blank(),
        legend.position="none"
        )

Apply GSEA to Genes within Cluster ranked by mean variance across all samples

set.seed(23)
ranks <- deframe(cl24.ranklist)
devtools::source_url("https://github.com/TranLab/ModuleLists/blob/main/NamedGeneRankList2GseaTable.R?raw=TRUE")
GSEA_delta_bound_df <- NamedGeneRankList2GseaTable(rankedgenes = ranks, geneset = "bloodmodules", output_directory = tempdir(), filename_prefix = paste0("PfSPZ_GSEA_", myTimepoint,
"_GeneCluster4_minSize5"), scoreType = "pos", minSize = 5, fixed_seed = TRUE)
## [1] "Running fgsea for MonacoModules signatures"
## [1] "Running fgsea for highBTMs signatures"
## [1] "Running fgsea for lowBTMs signatures"
## [1] "Running fgsea for BloodGen3Module signatures"

Make GSEA plot for Figure 4D of pre-print

GSEA of genes within the most variable cluster GC4 ranked by mean variance across all samples (minimum gene set size = 5, only modules with BH-adjusted p<0.20 shown)

myGSEAClusterPlotDat <- readxl::read_excel(paste0(tempdir(),
                                                  "PfSPZ_GSEA_", myTimepoint,"_GeneCluster4_minSize5",
                                                  " GSEA results bloodmodules.xlsx")) %>%
  filter(padj < 0.20) %>%
  mutate(neglogpadj = -log10(padj)) %>%
  mutate(pathway = gsub("VS", "v", pathway)) %>%
  mutate(pathway = gsub("Vd", "Vδ", pathway)) %>%
  mutate(pathway = gsub("gd", "γδ", pathway)) %>%
  mutate(pathway = sub(".*?\\_", "", pathway)) %>%
  dplyr::filter(!grepl("NEURON", pathway)) %>%
  mutate(pathway = fct_reorder(pathway, desc(NES))) %>%
  mutate(TextLabelColor = ifelse(module_type == "lowBTMs", scales::muted("red"),
                                 ifelse(module_type == "highBTMs", scales::muted("blue"),
                                        ifelse(module_type == "MonacoModules", "black","gray")))) %>%
  arrange(desc(NES)) 
myGSEAClusterPlot <- myGSEAClusterPlotDat %>% 
  ggplot(., aes(x = NES, y = pathway, fill = neglogpadj)) +
  geom_bar(stat = 'identity') + 
  viridis::scale_fill_viridis(option= "A", begin = 0.25, end = 0.75, alpha = 0.8, direction = -1, name = "-logadjP") +
  theme_classic(base_family = "sans", base_size = 6) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5,
                                   colour = myGSEAClusterPlotDat$TextLabelColor),
        legend.position = "bottom") +
  coord_flip()
addSmallLegend <- function(myPlot, pointSize = 2, textSize = 4, spaceLegend = 0.6) {
    myPlot +
        guides(shape = guide_legend(override.aes = list(size = pointSize)),
               color = guide_legend(override.aes = list(size = pointSize))) +
        theme(legend.title = element_text(size = textSize), 
              legend.text  = element_text(size = textSize),
              legend.key.size = unit(spaceLegend, "lines"))
}
addSmallLegend(myGSEAClusterPlot)